Cleaning of 1983-2021 U.S. Crude Oil Production Data and Performing A Tme Series Analysis

This data set, “Weekly_US_Crude_Oil.csv”, contains the weekly U.S. field production of crude oil in Thousand Barrels per day. The data is from the first week of 1983 to the second week of February 2021. The code below cleans this data to create time series line plots.

We begin by loading the data into R work space and calling the various supportive libraries for cleaning

# Loading the data into R

produnclean <- read.csv("C:\\Users\\User\\Desktop\\attachment_2 (4).csv", header= F, strip.white=T, skip =3)

#Inspecting the Loaded Data

head(produnclean)
##         V1     V2       V3     V4       V5     V6       V7     V8       V9
## 1 1983-Jan 01/07  8,634    01/14  8,634    01/21  8,634    01/28  8,634   
## 2 1983-Feb 02/04  8,660    02/11  8,660    02/18  8,660    02/25  8,660   
## 3 1983-Mar 03/04  8,677    03/11  8,677    03/18  8,677    03/25  8,677   
## 4 1983-Apr 04/01  8,677    04/08  8,686    04/15  8,686                   
## 5 1983-May                 05/13  8,682    05/20  8,682                   
## 6 1983-Jun 06/03  8,676    06/10  8,676    06/17  8,676    06/24  8,676   
##      V10      V11
## 1                
## 2                
## 3                
## 4 04/29  8,686   
## 5                
## 6
str(produnclean)
## 'data.frame':    458 obs. of  11 variables:
##  $ V1 : chr  "1983-Jan" "1983-Feb" "1983-Mar" "1983-Apr" ...
##  $ V2 : chr  "01/07 " "02/04 " "03/04 " "04/01 " ...
##  $ V3 : chr  "8,634   " "8,660   " "8,677   " "8,677   " ...
##  $ V4 : chr  "01/14 " "02/11 " "03/11 " "04/08 " ...
##  $ V5 : chr  "8,634   " "8,660   " "8,677   " "8,686   " ...
##  $ V6 : chr  "01/21 " "02/18 " "03/18 " "04/15 " ...
##  $ V7 : chr  "8,634   " "8,660   " "8,677   " "8,686   " ...
##  $ V8 : chr  "01/28 " "02/25 " "03/25 " "" ...
##  $ V9 : chr  "8,634   " "8,660   " "8,677   " "" ...
##  $ V10: chr  "" "" "" "04/29 " ...
##  $ V11: chr  "" "" "" "8,686   " ...
names(produnclean)
##  [1] "V1"  "V2"  "V3"  "V4"  "V5"  "V6"  "V7"  "V8"  "V9"  "V10" "V11"
#Beginning the Cleaning Process by eliminating unnecessary rows and columns

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.1.1
library(stringr)

library (tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.1
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v forcats 0.5.1
## v readr   2.0.1
## Warning: package 'readr' was built under R version 4.1.1
## Warning: package 'forcats' was built under R version 4.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library (lubridate)
## Warning: package 'lubridate' was built under R version 4.1.1
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
attach(produnclean)

Next, we split the month column and generate complete dates to be later converted into POSIX

#Generating Week One's full dates

produnclean <- produnclean %>% separate(V1, into =c('Year', 'Month'), sep='-') %>% subset(select=-Month)

Week1Date<- paste(produnclean$Year, "/", produnclean$V2)

produnclean <- produnclean %>% mutate(Week1Date)%>% subset(select=-V2)

#Generating Week two's full dates

Week2Date<- paste(produnclean$Year, "/", produnclean$V4)

produnclean <- produnclean %>% mutate(Week2Date)%>% subset(select=-V4)

#Generating Week three's full dates

Week3Date<- paste(produnclean$Year, "/", produnclean$V6)

produnclean <- produnclean %>% mutate(Week3Date)%>% subset(select=-V6)

#Generating week four's full dates

Week4Date<- paste(produnclean$Year, "/", produnclean$V8)

produnclean <- produnclean %>% mutate(Week4Date)%>% subset(select=-V8)

#Generating week five's full dates

Week5Date<- paste(produnclean$Year, "/", produnclean$V10)

produnclean <- produnclean %>% mutate(Week5Date)%>% subset(select=-V10)%>% subset(select=-Year)

We then assign the separated dates their correct production values by first joining them into a single string

#Matching week dates to their production values

produnclean <- produnclean %>% unite('Allweek1prod',Week1Date,V3, sep='_') %>% unite('Allweek2prod',Week2Date,V5, sep='_') %>% unite('Allweek3prod',Week3Date,V7, sep='_') %>% unite('Allweek4prod',Week4Date,V9, sep='_') %>% unite('Allweek5prod',Week5Date,V11, sep='_')

We then split the newly formed data frame of all five week’s production across the years into different data frames for narrowed down organization to generate one final data frame containing the two primary columns: Date and Production

#Sub-setting the various production weeks into different data frames for easier analysis

##For week 1

produncleanWk1 <- produnclean %>% select(Allweek1prod)

produncleanWk1 <- produncleanWk1 %>% separate(Allweek1prod, into =c('Date', 'Production'), sep ='_')

##For Week 2

produncleanWk2 <- produnclean %>% select(Allweek2prod)

produncleanWk2 <- produncleanWk2 %>% separate(Allweek2prod, into =c('Date', 'Production'), sep ='_')

##For Week 3

produncleanWk3 <- produnclean %>% select(Allweek3prod)

produncleanWk3 <- produncleanWk3 %>% separate(Allweek3prod, into =c('Date', 'Production'), sep ='_')

##For Week 4

produncleanWk4 <- produnclean %>% select(Allweek4prod)

produncleanWk4 <- produncleanWk4 %>% separate(Allweek4prod, into =c('Date', 'Production'), sep ='_')

##For Week 5

produncleanWk5 <- produnclean %>% select(Allweek5prod)

produncleanWk5 <- produncleanWk5 %>% separate(Allweek5prod, into =c('Date', 'Production'), sep ='_')
#Merging the 5 weeks' data frames into one

cleanprodlist <- list(produncleanWk1, produncleanWk2, produncleanWk3,produncleanWk4,produncleanWk5)

cleanprod <- cleanprodlist %>% reduce(full_join, by='Date')

cleanprod <- cleanprod %>% subset(select= c(Date,Production.x))

cleanprod <- cleanprod %>% rename(Production=Production.x)

In this step, we remove missing and invalid values. Here, we also restructure the data to make the Date and Production string values uniform for later conversion into the required analyzable formats. We then view the data to verify these structural changes.

#Removing missing values

cleanprod <- cleanprod %>% na.exclude(cleanprod)

sum(is.na(cleanprod))
## [1] 0
#Removing the dates that lack production (resulting from week 5 data)

cleanprod <- cleanprod %>% subset(Date!="1983 / " & Date!="1984 / " & Date!="1985 / " & Date!="1986 / " & Date!="1987 / " & Date!="1988 / " & Date!="1989 / " & Date!="1990 / " & Date!="1991 / " & Date!="1992 / " & Date!="1993 / " & Date!="1994 / " & Date!="1995 / " & Date!="1996 / " & Date!="1997 / " & Date!="1998 / " & Date!="1999 / " & Date!="2000 / " & Date!="2001 / " & Date!="2002 / " & Date!="2003 / " & Date!="2004 / " & Date!="2005 / " & Date!="2006 / " & Date!="2007 / " & Date!="2008 / " & Date!="2009 / " & Date!="2010 / " & Date!="2011 / " & Date!="2012 / " & Date!="2013 / " & Date!="2014 / " & Date!="2015 / " & Date!="2016 / " & Date!="2017 / " & Date!="2018 / " & Date!="2019 / " & Date!="2020 / " & Date!="2021 / ")

#Unifying date format

cleanprod <- cleanprod %>% mutate (Date = gsub ('\\ /', "/", Date))

cleanprod <- cleanprod %>% mutate (Date = gsub (" ", "", Date))

cleanprod <- cleanprod %>% mutate (Production = gsub (",", "", Production))

Converting the Date and Column values into POSIXct and numeric formats respectively to enable time series analysis and graphics.

cleanprod <- cleanprod %>% mutate (Date = as.POSIXct(Date, format = "%Y/%m/%d"))

cleanprod <-cleanprod %>% mutate (Production = as.numeric(Production))

Generating the interactive time series trendline for the cleaned weekly data

Weekly Oil Production Plot

You can also embed plots, for example:

## Warning: package 'plotly' was built under R version 4.1.2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout